# --------------------------------
# setup
# --------------------------------

# libraries
library(dplyr)
library(readxl)

# --------------------------------
# load data
# --------------------------------

# interview transcripts
# One row per input file. `skip` is the number of leading lines read.csv
# discards before the header row; it differs between files, so it is listed
# explicitly per file rather than shared.
transcript_files <- data.frame(
  path = c(
    "data/inputs/HCF-1998-10-18 procesado.csv",
    "data/inputs/HSR-1997-05-19 procesado.csv",
    "data/inputs/HSR-1998-11-01 procesado.csv",
    "data/inputs/HSR-1998-12-03 procesado.csv"
  ),
  skip = c(11, 11, 10, 11),
  guest = c("HCF", "HSR", "HSR", "HSR"),
  stringsAsFactors = FALSE
)

# Read one semicolon-separated transcript file, keep the Transcript and
# Speaker columns (renamed to text / speaker) and tag every row with the
# guest label.
read_transcript <- function(path, skip, guest_label) {
  # stringsAsFactors = FALSE keeps text/speaker as character on every R
  # version; with factor columns, assigning a new speaker label later would
  # produce NA instead of the new value.
  read.csv(path, sep = ";", skip = skip, stringsAsFactors = FALSE) %>%
    select(text = Transcript, speaker = Speaker) %>%
    mutate(guest = guest_label)
}

# Stack all files in the order listed above. The list passed to rbind is
# unnamed on purpose: named list elements would leak into the row names.
granier_data <- do.call(
  rbind,
  lapply(seq_len(nrow(transcript_files)), function(i) {
    read_transcript(
      path = transcript_files$path[i],
      skip = transcript_files$skip[i],
      guest_label = transcript_files$guest[i]
    )
  })
)

# drop the temporaries so only granier_data remains in the workspace
rm(transcript_files, read_transcript)

# --------------------------------
# pre-process
# --------------------------------
# normalise the speaker label: replace the accented spelling with the plain one
granier_data[["speaker"]] <- replace(
  granier_data[["speaker"]],
  granier_data[["speaker"]] == "Hugo Chávez",
  "Hugo Chavez"
)

# transliterate accented characters to their unaccented counterparts
# (old[i] is mapped to new[i]), then lower-case the transcript text
granier_data[["text"]] <- tolower(
  chartr(
    old = iconv("ãâàèìòùáéíóöúüûñÀÈÌÒÙÁÉÍÓÚÑ", to = "UTF-8"),
    new = "aaaeiouaeioouuunAEIOUAEIOUN",
    x = granier_data[["text"]]
  )
)
# disabled step, kept for reference:
#granier_data$text <- gsub("[^a-zA-Z]", " ", granier_data$text)  # keep only text

# --------------------------------
# create and prune vocab
# --------------------------------

# stopwords
# Combine the Spanish stopword lists from the stopwords-iso and Snowball
# sources. They must be concatenated with c(): the second positional
# argument of unique() is `incomparables`, not a second vector to merge, so
# unique(a, b) would silently ignore the Snowball list.
stopwords_es <- c(
  stopwords::data_stopwords_stopwordsiso$es,
  stopwords::data_stopwords_snowball$es
)

# remove accents with the same transliteration applied to the transcripts;
# chartr is vectorised, so no per-element loop is needed
stopwords_es <- chartr(
  iconv("ãâàèìòùáéíóöúüûñÀÈÌÒÙÁÉÍÓÚÑ", to = "UTF-8"),
  "aaaeiouaeioouuunAEIOUAEIOUN",
  stopwords_es
)
stopwords_es <- gsub("[^a-zA-Z]", "", stopwords_es)  # keep only text

# drop entries emptied by the step above, and deduplicate last because
# normalisation can collapse distinct entries into the same word
stopwords_es <- unique(stopwords_es[stopwords_es != ""])

# write the pre-processed transcripts and the stopword vector to disk as RDS
saveRDS(object = granier_data, file = "data/outputs/granier_data.rds")
saveRDS(object = stopwords_es, file = "data/outputs/stopwords_es.rds")
